# TODO: can't use Boston College in example of "counterfactual high prestige"

Findings

library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(tidygraph)
## 
## Attaching package: 'tidygraph'
## The following object is masked from 'package:igraph':
## 
##     groups
## The following object is masked from 'package:stats':
## 
##     filter
library(ggraph)
# library(smglr)
library(broom)
library(ggforce)
library(ggbeeswarm)

library(assertthat)
## 
## Attaching package: 'assertthat'
## The following object is masked from 'package:tibble':
## 
##     has_name
library(tictoc)

data_folder = '../data/'
plots_path = '../output/03_'
paper_folder = '../paper/'

## Load data ----
load(str_c(data_folder, '02_parsed.Rdata'))

nrow(individual_df)
## [1] 3448
individual_df %>% 
    filter(permanent) %>% 
    nrow()
## [1] 1809
individual_df %>% 
    filter(permanent) %>% 
    nrow() %>% 
    {./nrow(individual_df)}
## [1] 0.524652

There are 203 PhD programs producing graduate students in the data

univ_df %>%
    filter(total_placements > 0) %>%
    nrow()
## [1] 220

167 PhD programs with permanent placements

individual_df %>% 
    filter(permanent) %>% 
    count(placing_univ) %>% 
    nrow()
## [1] 179
univ_df %>% 
    filter(perm_placements > 0) %>% 
    nrow()
## [1] 180

Finding: 37 PhD programs (37/203 = 18%) produce about 50% of permanent placements

ggplot(univ_df, aes(perm_placement_rank, frac_cum_perm_placements)) + 
    geom_step() +
    scale_x_continuous(labels = scales::percent_format(), 
                       name = 'PhD Programs') +
    scale_y_continuous(labels = scales::percent_format(), 
                       name = 'Permanent Placements')
## Warning: Removed 25 row(s) containing missing values (geom_path).

univ_df %>%
    filter(frac_cum_perm_placements <= .5) %>%
    arrange(perm_placement_rank) %>%
    mutate(perm_placement_rank = row_number()) %>%
    select(perm_placement_rank, univ_name, 
           perm_placements, frac_cum_perm_placements) %>%
    knitr::kable()
perm_placement_rank univ_name perm_placements frac_cum_perm_placements
1 University of Oxford 42 0.0232172
2 Graduate Center of the City University of New York 40 0.0453289
3 University of Notre Dame 37 0.0657822
4 Princeton University 35 0.0851299
5 The Catholic University of America* 30 0.1017137
6 University of Toronto 29 0.1177446
7 Stanford University 28 0.1332228
8 University of Southern California 27 0.1481481
9 University of Michigan 26 0.1625207
10 Yale University 26 0.1768933
11 Rutgers University 26 0.1912659
12 Columbia University 25 0.2050857
13 University of North Carolina at Chapel Hill 25 0.2189055
14 University of Chicago 25 0.2327253
15 Massachusetts Institute of Technology 24 0.2459923
16 Pennsylvania State University 23 0.2587065
17 University of California, Berkeley 23 0.2714207
18 New York University 23 0.2841349
19 Katholieke Universiteit Leuven 23 0.2968491
20 Emory University 22 0.3090105
21 Harvard University 22 0.3211719
22 University of California, Los Angeles 21 0.3327805
23 Vanderbilt University 21 0.3443892
24 University of Wisconsin-Madison 21 0.3559978
25 Duquesne University 20 0.3670536
26 Boston College 20 0.3781095
27 University of Arizona 20 0.3891653
28 The New School 20 0.4002211
29 Baylor University 20 0.4112769
30 Saint Louis University 19 0.4217800
31 University of South Florida 19 0.4322830
32 Cornell University 19 0.4427861
33 University of Virginia 18 0.4527363
34 Australian National University 18 0.4626866
35 Stony Brook University 18 0.4726368
36 Villanova University 17 0.4820343
37 DePaul University 17 0.4914317
## Placements at PhD-producing programs
grad_programs = univ_df %>% 
    filter(total_placements > 0, country %in% c('U.S.')) %>% 
    pull(univ_id)
# gp2 = individual_df %>% 
#     count(placing_univ_id) %>% 
#     pull(placing_univ_id)

individual_df %>% 
    filter(permanent, hiring_univ_id %in% grad_programs) %>% 
    filter(placing_univ_id %in% grad_programs) %>%
    filter(position_type == 'Tenure-Track') %>%
    count(placing_univ) %>%
    arrange(desc(n)) %>% 
    rename(phd_program_placements = n) %>% 
    mutate(cum_phd_program_placements = cumsum(phd_program_placements), 
           share_phd_program_placements = cum_phd_program_placements / sum(phd_program_placements)) %>% 
    slice(1:20) %>% 
    knitr::kable(digits = 2)
placing_univ phd_program_placements cum_phd_program_placements share_phd_program_placements
Princeton University 18 18 0.06
University of California, Berkeley 13 31 0.11
Harvard University 12 43 0.15
Massachusetts Institute of Technology 12 55 0.19
New York University 12 67 0.24
Rutgers University 12 79 0.28
Stanford University 10 89 0.31
University of Chicago 10 99 0.35
Yale University 10 109 0.39
University of Arizona 9 118 0.42
University of California, Irvine (LPS) 8 126 0.45
Columbia University 7 133 0.47
Pennsylvania State University 7 140 0.49
University of California, Los Angeles 7 147 0.52
University of Michigan 7 154 0.54
University of North Carolina at Chapel Hill 7 161 0.57
University of Notre Dame 7 168 0.59
DePaul University 6 174 0.61
University of Pittsburgh 6 180 0.64
University of Pittsburgh (HPS) 6 186 0.66

Build network

## build network ----
## NB Only permanent placements
hiring_network = individual_df %>%
    filter(permanent) %>% 
    select(placing_univ_id, hiring_univ_id, everything()) %>%
    graph_from_data_frame(directed = TRUE, 
                          vertices = univ_df) %>%
    as_tbl_graph() %>%
    ## Clean nodes (universities) that aren't in the network
    mutate(degree = centrality_degree(mode = 'total')) %>%
    filter(degree > 0)

hiring_network
## # A tbl_graph: 936 nodes and 1809 edges
## #
## # A directed multigraph with 17 components
## #
## # Node Data: 936 × 23 (active)
##   name  univ_name cluster_2 cluster_3 cluster_6 cluster_label city  state
##   <chr> <chr>     <chr>     <chr>     <chr>     <chr>         <chr> <chr>
## 1 372   Adelphi … <NA>      <NA>      <NA>      <NA>          Gard… NY   
## 2 117   Air Forc… <NA>      <NA>      <NA>      <NA>          U.S.… CO   
## 3 10409 Al-Asmar… <NA>      <NA>      <NA>      <NA>          <NA>  <NA> 
## 4 992   Albany M… <NA>      <NA>      <NA>      <NA>          Alba… NY   
## 5 353   Albany S… <NA>      <NA>      <NA>      <NA>          Alba… GA   
## 6 764   Alberto … <NA>      <NA>      <NA>      <NA>          Sant… <NA> 
## # … with 930 more rows, and 15 more variables: country <chr>,
## #   total_placements <int>, perm_placements <int>, perm_placement_rate <dbl>,
## #   frac_perm_placements <dbl>, cum_perm_placements <int>,
## #   frac_cum_perm_placements <dbl>, perm_placement_rank <dbl>,
## #   aos_diversity <dbl>, m_count <dbl>, w_count <dbl>, o_count <dbl>,
## #   gender_na_count <dbl>, frac_w <dbl>, degree <dbl>
## #
## # Edge Data: 1,809 × 14
##    from    to person_id gender ethnicity race  aos_category aos  
##   <int> <int>     <int> <chr>  <chr>     <chr> <chr>        <chr>
## 1   696   757         1 m      4         7     Science, Lo… Cogn…
## 2   209   910         2 m      2         5     History and… Medi…
## 3   693   727         3 m      4         7     LEMM         Lang…
## # … with 1,806 more rows, and 6 more variables: graduation_year <int>,
## #   placing_univ <chr>, placement_year <int>, hiring_univ <chr>,
## #   position_type <chr>, permanent <lgl>
assert_that(length(E(hiring_network)) == {individual_df %>% 
        filter(permanent) %>% 
        nrow()}, 
        msg = 'Number of edges in network ≠ number of perm. placements')
## [1] TRUE
## 1 giant component contains almost all of the schools
components(hiring_network)$csize
##  [1] 902   2   5   2   2   2   2   1   3   2   2   2   2   2   2   2   1

Out- and in-centrality

to_reverse = function (graph) {
    ## Reverse edge direction
    if (!is.directed(graph))
        return(graph)
    e <- get.data.frame(graph, what="edges")
    ## swap "from" & "to"
    neworder <- 1:length(e)
    neworder[1:2] <- c(2,1)
    e <- e[neworder]
    names(e) <- names(e)[neworder]
    vertex_df = get.data.frame(graph, what = 'vertices')
    if (ncol(vertex_df) > 0) {
        return(as_tbl_graph(graph.data.frame(e, vertices = vertex_df)))
    } else {
        return(as_tbl_graph(graph.data.frame(e)))
    }
}

set.seed(42)
nzmin = function(x, na.rm = TRUE) {
    min(x[x > 0], na.rm = na.rm)
}
damping = .3
hiring_network = hiring_network %>%
    mutate(in_centrality = centrality_eigen(weights = NA, 
                                            directed = TRUE), 
           in_pr = centrality_pagerank(weights = NA, 
                                       directed = TRUE, 
                                       damping = damping, 
                                       personalized = NULL)) %>%
    morph(to_reverse) %>%
    mutate(out_centrality = centrality_eigen(weights = NA,
                                             directed = TRUE), 
           out_pr = centrality_pagerank(weights = NA, 
                                        directed = TRUE, 
                                        damping = damping, 
                                        personalized = NULL)) %>%
    unmorph() %>%
    ## Trim 0s to minimum non-zero values
    mutate(out_centrality = ifelse(out_centrality == 0,
                                   nzmin(out_centrality),
                                   out_centrality),
           in_centrality = ifelse(in_centrality == 0,
                                  nzmin(in_centrality),
                                  in_centrality))

## Check relationship btwn unweighted multiedges and weighted edges
## Strong correlation, esp by approx rank and high/low prestige
## But some movement, both numerically and ordinally
# E(hiring_network)$weight = count_multiple(hiring_network)
# hiring_network_simp = simplify(hiring_network)
# 
# set.seed(42)
# V(hiring_network_simp)$out_centrality = hiring_network_simp %>%
#     graph.reverse() %>%
#     eigen_centrality(directed = TRUE) %>%
#     .$vector
# 
# tibble(name = V(hiring_network)$univ_name,
#        multi = V(hiring_network)$out_centrality,
#        simp = V(hiring_network_simp)$out_centrality) %>%
#     ggplot(aes(log10(simp), log10(multi))) + 
#     geom_point() +
#     stat_function(fun = function (x) x)

Exploring centrality scores

## exploring centrality scores ----

PageRank centrality is almost entirely determined by degree So we use eigenvector centrality instead

ggplot(as_tibble(hiring_network), aes(degree, log10(out_pr))) +
    geom_point(aes(text = univ_name)) +
    geom_smooth(method = 'lm')
## Warning: Ignoring unknown aesthetics: text
## `geom_smooth()` using formula 'y ~ x'

There are two clear groups of centrality scores, with high scores (in the range of ~10^-12 to 1) and low scores (10^-15 and smaller).

##
## NB there seem to be (small?) differences in scores (at the low end?) across runs of the centrality algorithm
ggplot(as_tibble(hiring_network), aes(out_centrality)) + 
    # geom_density() +
    geom_histogram(binwidth = 1, fill = 'white', color = 'black') +
    geom_rug() +
    scale_x_continuous(trans = 'log10', 
                       name = 'Out centrality', 
                       breaks = scales::trans_breaks("log10", function(x) 10^x),
                       labels = scales::trans_format("log10", scales::math_format(10^.x))) +
    facet_zoom(x = out_centrality > 10^-12, 
               ylim = c(0, 20),
               # ylim = c(0, .02),
               show.area = FALSE, 
               shrink = TRUE,
               zoom.size = .3,
               horizontal = FALSE) +
    theme_bw()

ggsave(str_c(plots_path, 'out_centrality.png'), 
       height = 3, width = 6)
ggsave(str_c(paper_folder, 'fig_out_density.png'), 
       height = 3, width = 6)

ggplot(as_tibble(hiring_network), aes(in_centrality)) + 
    geom_density() + geom_rug() +
    scale_x_continuous(trans = 'log10')

ggplot(as_tibble(hiring_network), 
       aes(out_centrality, in_centrality, 
           color = cluster_label, 
           text = univ_name)) +
    geom_jitter() + 
    scale_x_log10() + scale_y_log10()

plotly::ggplotly()
# ## Check stability of centrality calculation
# iterated_centrality = 1:500 %>%
#     map(~ {hiring_network %>%
#             graph.reverse() %>%
#             eigen_centrality(directed = TRUE, weights = NA) %>%
#             .$vector}) %>%
#     transpose() %>%
#     map(unlist) %>%
#     map(~ tibble(out_centrality = .)) %>%
#     bind_rows(.id = 'univ_id') %>%
#     group_by(univ_id) %>%
#     summarize(min = min(out_centrality),
#               max = max(out_centrality),
#               median = median(out_centrality))
# 
# ## Lots of variation among low-prestige
# ggplot(iterated_centrality, 
#        aes(reorder(univ_id, median), 
#            median)) + 
#     geom_pointrange(aes(ymin = min, ymax = max)) + 
#     scale_y_log10()
# 
# ## But extremely stable results among high-prestige
# iterated_centrality %>%
#     filter(min > 10^-12) %>%
#     ggplot(aes(reorder(univ_id, median), 
#                median)) + 
#     geom_pointrange(aes(ymin = min, ymax = max)) + 
#     scale_y_log10()

We completely rewire the network, preserving out-degree (number of new PhDs) and in-degree (number of permanent hires), but otherwise randomly matching job-seekers to positions. We then recalculate out-centrality. The plots below show the out-centrality distributions for each random rewiring, with the actual distribution in red. The distributions are always bimodal, indicating that the observed split into high- and low-centrality programs is due in part to the way PhD production and hiring are distributed across institutions. However, the high-centrality subset is never as small as in the actual hiring network. This indicates that bimodiality is not due only to the degree distributions. Some other factor is exacerbating the structural inequality in the hiring network.

set.seed(78910)
map(1:100, 
    ~ sample_degseq(degree(hiring_network, mode = 'out'), 
                    degree(hiring_network, mode = 'in'))
) %>%
    map(to_reverse) %>%
    map(eigen_centrality, directed = TRUE, weights = NULL) %>%
    map(~ .$vector) %>%
    map(log10) %>%
    map(density) %>%
    map(~ tibble(x = .$x, y = .$y)) %>%
    bind_rows(.id = 'iteration') %>%
    ggplot(aes(x, y)) + 
    geom_line(aes(group = iteration), alpha = .5) + 
    stat_density(data = as_tibble(hiring_network), geom = 'line',
                 aes(log10(out_centrality), ..density..),
                 color = 'red', size = 1)

Finding: High output is only modestly correlated w/ centrality. Programs like Leuven, New School, and Boston College produce lots of PhDs, but they aren’t placed into the high-centrality departments.

ggplot(as_tibble(hiring_network), 
       aes(total_placements, log10(out_centrality))) +
    geom_point(aes(text = univ_name)) +
    geom_smooth()
## Warning: Ignoring unknown aesthetics: text
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 735 rows containing non-finite values (stat_smooth).
## Warning: Removed 735 rows containing missing values (geom_point).

plotly::ggplotly()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 735 rows containing non-finite values (stat_smooth).
# univ_df %>%
#     mutate(out_centrality_log = log10(out_centrality)) %>%
#     filter(!is.na(out_centrality_log) &
#                out_centrality > 0 &
#                !is.na(total_placements)) %>%
#     select(total_placements, out_centrality_log) %>%
#     cor()
hiring_network %>%
    select(total_placements, out_centrality) %>%
    as_tibble() %>%
    filter(complete.cases(.)) %>%
    mutate(out_centrality = log10(out_centrality)) %>%
    cor()
##                  total_placements out_centrality
## total_placements        1.0000000      0.5480276
## out_centrality          0.5480276      1.0000000
ggplot(as_tibble(hiring_network), 
       aes(perm_placement_rank, log10(out_centrality))) +
    geom_point() +
    geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 735 rows containing non-finite values (stat_smooth).

## Warning: Removed 735 rows containing missing values (geom_point).

ggplot(as_tibble(hiring_network), 
       aes(log10(out_centrality), perm_placement_rate)) +
    geom_point(aes(text = univ_name)) +
    geom_smooth(method = 'lm')
## Warning: Ignoring unknown aesthetics: text
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 735 rows containing non-finite values (stat_smooth).

## Warning: Removed 735 rows containing missing values (geom_point).

plotly::ggplotly()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 735 rows containing non-finite values (stat_smooth).
as_tibble(hiring_network) %>%
    select(total_placements, perm_placement_rate, 
           out_centrality) %>%
    filter(complete.cases(.)) %>%
    mutate(out_centrality = log10(out_centrality)) %>%
    cor()
##                     total_placements perm_placement_rate out_centrality
## total_placements          1.00000000          0.02747104      0.5480276
## perm_placement_rate       0.02747104          1.00000000      0.1713565
## out_centrality            0.54802764          0.17135653      1.0000000
## Add centrality scores to univ_df
univ_df = hiring_network %>%
    as_tibble() %>%
    rename(univ_id = name) %>%
    left_join(univ_df, .)
## Joining, by = c("univ_id", "univ_name", "cluster_2", "cluster_3", "cluster_6", "cluster_label", "city", "state", "country", "total_placements", "perm_placements", "perm_placement_rate", "frac_perm_placements", "cum_perm_placements", "frac_cum_perm_placements", "perm_placement_rank", "aos_diversity", "m_count", "w_count", "o_count", "gender_na_count", "frac_w")
## Movement down the prestige hierarchy
individual_df %>%
    filter(permanent) %>%
    left_join(select(univ_df, univ_id, out_centrality),
              by = c('placing_univ_id' = 'univ_id')) %>%
    left_join(select(univ_df, univ_id, out_centrality),
              by = c('hiring_univ_id' = 'univ_id')) %>%
    # filter(out_centrality.y > 10^-12) %>%
    select(person_id, aos_category,
           placing = out_centrality.x,
           hiring = out_centrality.y) %>%
    gather(variable, value, placing, hiring) %>%
    mutate(variable = fct_rev(variable)) %>%
    ggplot(aes(variable, log10(value))) +
    geom_point() +
    geom_line(aes(group = person_id), 
              alpha = .25) +
    xlab('university') +
    ylab('centrality (log10)') +
    theme_minimal()
## Warning: attributes are not identical across measure variables;
## they will be dropped

ggsave(str_c(plots_path, 'prestige_movement.png'), 
       width = 3, height = 4)
ggsave(str_c(paper_folder, 'fig_crossing.png'), 
       width = 3, height = 4)

## Diagonal line indicates where hiring program has the same centrality as the placing program.  
## Most placements are below this line, indicating that the centrality measure captures the idea that people typically are hired by programs with lower status
## The few placements above the line indicate that, even when individuals are hired by programs with higher status, the increase is relatively minor:  no more than 1 point on the log10 scale
individual_df %>% 
    filter(permanent) %>%
    left_join(select(univ_df, univ_id, out_centrality), 
              by = c('placing_univ_id' = 'univ_id')) %>%
    left_join(select(univ_df, univ_id, out_centrality),
              by = c('hiring_univ_id' = 'univ_id')) %>%
    filter(out_centrality.y > 10^-12) %>%
    select(person_id, 
           placing = out_centrality.x, 
           hiring = out_centrality.y) %>%
    ggplot(aes(log10(placing), log10(hiring))) + 
    geom_jitter() + 
    stat_function(fun = identity)

Community detection

## community detection ----
## Extract giant component
hiring_network_gc = hiring_network %>%
    components %>% 
    .$csize %>%
    {which(. == max(.))} %>%
    {components(hiring_network)$membership == .} %>%
    which() %>%
    induced_subgraph(hiring_network, .) %>%
    as_tbl_graph()
walk_len = rep(2:100, 1)
## ~24 sec
tic()
comm_stats = walk_len %>%
    map(~ cluster_walktrap(hiring_network_gc, steps = .x)) %>%
    map(~ list(sizes = sizes(.x), length = length(.x))) %>%
    map_dfr(~ tibble(H = -sum(.x$sizes/sum(.x$sizes) * log2(.x$sizes/sum(.x$sizes))),
                     n_comms = .x$length)) %>%
    mutate(walk_len = walk_len,
           delta_H = log2(n_comms) - H)
toc()
## 21.327 sec elapsed
ggplot(comm_stats, aes(walk_len, H)) +
    geom_point() +
    geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(comm_stats, aes(walk_len, n_comms)) +
    geom_point() +
    geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(comm_stats, aes(n_comms, H)) +
    geom_text(aes(label = walk_len))

Select the walk length that minimizes both delta_H (flatter community distribution) and n_comms (fewer communities) using regression residuals

walk_length = lm(H ~ n_comms, data = comm_stats) %>%
    augment(comm_stats) %>%
    # ggplot(aes(walk_len, .resid)) +
    # geom_text(aes(label = walk_len))
    arrange(desc(.resid)) %>%
    pull(walk_len) %>%
    first()

walk_length
## [1] 5
communities = cluster_walktrap(hiring_network_gc, steps = walk_length)
V(hiring_network_gc)$community = membership(communities)

Analysis of community detection

## Summary of community sizes
hiring_network_gc %>% 
    as_tibble() %>% 
    count(community) %>% 
    pull(n) %>% 
    summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   4.000   7.000   9.495  11.000 113.000
univ_df = univ_df %>%
    left_join({hiring_network_gc %>%
            as_tibble() %>% 
            select(univ_id = name, community) %>%
            mutate(community = as.character(community))})
## Joining, by = "univ_id"
univ_df %>% 
    filter(!is.na(community)) %>% 
    count(community) %>% 
    ggplot(aes(n)) +
    geom_bar(aes(text = n)) +
    scale_x_continuous(name = 'Community size (# programs)')
## Warning: Ignoring unknown aesthetics: text

plotly::ggplotly()
cluster_vars = univ_df %>% 
    select(matches('cluster')) %>% 
    names()

univ_df %>% 
    filter(!is.na(community), !is.na('cluster_lvl4'))
## # A tibble: 902 × 28
##    univ_id univ_name     cluster_2 cluster_3 cluster_6 cluster_label city  state
##    <chr>   <chr>         <chr>     <chr>     <chr>     <chr>         <chr> <chr>
##  1 372     Adelphi Univ… <NA>      <NA>      <NA>      <NA>          Gard… NY   
##  2 117     Air Force Ac… <NA>      <NA>      <NA>      <NA>          U.S.… CO   
##  3 10409   Al-Asmarya U… <NA>      <NA>      <NA>      <NA>          <NA>  <NA> 
##  4 992     Albany Medic… <NA>      <NA>      <NA>      <NA>          Alba… NY   
##  5 353     Albany State… <NA>      <NA>      <NA>      <NA>          Alba… GA   
##  6 764     Alberto Hurt… <NA>      <NA>      <NA>      <NA>          Sant… <NA> 
##  7 3393    Allen Univer… <NA>      <NA>      <NA>      <NA>          Colu… SC   
##  8 3513    Amarillo Col… <NA>      <NA>      <NA>      <NA>          Amar… TX   
##  9 385     American Uni… <NA>      <NA>      <NA>      <NA>          Wash… DC   
## 10 1134    American Uni… <NA>      <NA>      <NA>      <NA>          <NA>  NULL 
## # … with 892 more rows, and 20 more variables: country <chr>,
## #   total_placements <int>, perm_placements <int>, perm_placement_rate <dbl>,
## #   frac_perm_placements <dbl>, cum_perm_placements <int>,
## #   frac_cum_perm_placements <dbl>, perm_placement_rank <dbl>,
## #   aos_diversity <dbl>, m_count <dbl>, w_count <dbl>, o_count <dbl>,
## #   gender_na_count <dbl>, frac_w <dbl>, degree <dbl>, in_centrality <dbl>,
## #   in_pr <dbl>, out_centrality <dbl>, out_pr <dbl>, community <chr>

Finding: There is no correlation between semantic clusters and topological communities.

univ_df %>%
    filter(!is.na(community), !is.na(cluster_label)) %>%
    select(community, cluster_label) %>%
    table() %>%
    chisq.test(simulate.p.value = TRUE)
## 
##  Pearson's Chi-squared test with simulated p-value (based on 2000
##  replicates)
## 
## data:  .
## X-squared = 164.16, df = NA, p-value = 0.05547
univ_df %>%
    filter(!is.na(community), !is.na(cluster_label)) %>%
    count(community, cluster_label) %>%
    rename(cluster_n = n) %>%
    group_by(community) %>%
    mutate(community_tot = sum(cluster_n), 
           cluster_frac = cluster_n / community_tot, 
           H = sum(cluster_frac * log2(cluster_frac))) %>%
    ungroup() %>%
    ggplot(aes(fct_reorder(community, community_tot, .desc = FALSE),
               cluster_n, fill = cluster_label)) + 
    geom_col() + 
    coord_flip() +
    xlab('Topological communities') +
    ylab('# of schools in community') +
    scale_fill_brewer(palette = 'Set1', 
                      name = 'Semantic\nclusters')

High-prestige universities

## high-prestige universities ----
## Start w/ NYU, and work upstream
## Only need to go 11 or 12 steps to get closure
1:25 %>%
    map(~ make_ego_graph(hiring_network, order = .x, 
                         nodes = '34', mode = 'in')) %>%
    flatten() %>%
    map(~ length(V(.x))) %>%
    tibble(order = 1:length(.), 
           size = unlist(.)) %>%
    ggplot(aes(order, size)) + geom_point()

prestigious = make_ego_graph(hiring_network, order = 12, 
                             nodes = '34', 
                             mode = 'in') %>%
    .[[1]] %>%
    as_tbl_graph()

## How large is the high-prestige community?  
## 56 programs; 7% of all programs in the network; 
## 28% of programs with at least 1 placement in the dataset
length(V(prestigious))
## [1] 83
length(V(prestigious)) / length(V(hiring_network))
## [1] 0.08867521
length(V(prestigious)) / sum(univ_df$total_placements > 0, na.rm = TRUE)
## [1] 0.3772727
## What fraction of hires are within high-prestige?  
## 13% of all permanent hires; 7% of all hires
length(E(prestigious)) / length(E(hiring_network))
## [1] 0.1796573
length(E(prestigious)) / nrow(individual_df)
## [1] 0.09425754
# layout_prestigious = layout_with_focus(prestigious, 
#                                        which(V(prestigious)$univ_name == 'New York University')) %>% 
#     `colnames<-`(c('x', 'y')) %>% 
#     as_tibble()
layout_prestigious = create_layout(prestigious, 'focus', 
                                   focus = which(V(prestigious)$univ_name == 'New York University'))

# png(file = '02_prestigious_net.png', 
#     width = 11, height = 11, units = 'in', res = 400)
# set.seed(24)
ggraph(layout_prestigious) + 
    geom_node_label(aes(label = univ_name, 
                        size = log10(out_centrality), 
                        fill = log10(out_centrality)),
                    color = 'white') + 
    geom_edge_parallel(arrow = arrow(length = unit(.01, 'npc')), 
                       alpha = .25,
                       strength = 5) +
    scale_size_continuous(range = c(.5, 3), guide = 'none') +
    scale_fill_viridis(name = 'out centrality (log10)') +
    coord_cartesian(xlim = c(-3.5, 5.5), clip = 'on') +
    theme_graph() +
    theme(legend.position = 'bottom', 
          plot.margin = margin(0, unit = 'cm'))
## Warning: Ignoring unknown parameters: strength

ggsave(str_c(plots_path, 'prestigious_net.png'), 
       width = 6, height = 4, dpi = 600, scale = 2)
ggsave(str_c(paper_folder, 'fig_prestigious_net.png'), 
       width = 6, height = 4, dpi = 600, scale = 2)
# dev.off()

## High-prestige = high centrality group
univ_df = univ_df %>%
    mutate(prestige = ifelse(univ_id %in% V(prestigious)$name, 
                             'high-prestige', 
                             'low-prestige'))
V(hiring_network)$prestigious = V(hiring_network)$out_centrality > 1e-12
V(hiring_network_gc)$prestigious = V(hiring_network_gc)$out_centrality > 1e-12

ggplot(univ_df, aes(prestige, log10(out_centrality))) + 
    geom_jitter()
## Warning: Removed 304 rows containing missing values (geom_point).

## Output alphabetical list of high-prestige programs
high_prestige_tab = univ_df %>% 
    filter(prestige == 'high-prestige') %>% 
    select(univ_name, cluster = cluster_label, 
           perm_placement_rate,
           country, 
           # out_centrality
    ) %>% 
    # arrange(desc(out_centrality)) %>% view()
    arrange(univ_name) %>% 
    mutate(perm_placement_rate = scales::percent_format(accuracy = 2)(perm_placement_rate)) %>% 
    knitr::kable(col.names = c('Program', 'Cluster', 
                               'Placement Rate',
                               'Country'),
                 format = 'latex', 
                 longtable = TRUE,
                 booktabs = TRUE, 
                 # table.envir = 'sidewaystable',
                 label = 'high.prestige', 
                 caption = 'High-prestige programs, in alphabetical order.  Placement rate refers to placements in permanent academic positions, out of all graduates.')

write_file(high_prestige_tab, path = str_c(plots_path, 'high_prestige.tex'))
## Warning: The `path` argument of `write_file()` is deprecated as of readr 1.4.0.
## Please use the `file` argument instead.
write_file(high_prestige_tab, path = str_c(paper_folder, 'tab_high_prestige.tex'))

## Prestige status and clusters
## High-prestige are spread throughout, but #5 is mostly low-prestige
univ_df %>%
    filter(!is.na(cluster_label)) %>%
    ggplot(aes(cluster_label, color = prestige)) + 
    geom_point(stat = 'count') +
    geom_line(aes(group = prestige), stat = 'count')

## High-prestige are mostly in the largest community
univ_df %>%
    filter(!is.na(community)) %>%
    ggplot(aes(as.integer(community), fill = prestige)) + 
    geom_bar()

## What fraction of high-prestige graduates end up in high-prestige programs? 
## 24% of those w/ permanent placements
individual_df %>%
    filter(permanent) %>%
    left_join(univ_df, by = c('placing_univ_id' = 'univ_id')) %>%
    left_join(univ_df, by = c('hiring_univ_id' = 'univ_id')) %>%
    select(placing = prestige.x, hiring = prestige.y) %>%
    count(placing, hiring) %>%
    group_by(placing) %>%
    mutate(share = n / sum(n))
## # A tibble: 3 × 4
## # Groups:   placing [2]
##   placing       hiring            n share
##   <chr>         <chr>         <int> <dbl>
## 1 high-prestige high-prestige   325 0.261
## 2 high-prestige low-prestige    920 0.739
## 3 low-prestige  low-prestige    564 1

Finding: Median permanent placement rate for high-prestige programs is 22 points higher than for low-prestige programs. However, variation is also wide within each group;
This is also not yet controlling for graduation year, area, or demographics.

ggplot(univ_df, aes(prestige, perm_placement_rate, 
                    label = univ_name)) + 
    # geom_sina(aes(size = total_placements), 
    #           alpha = .5) +
    geom_beeswarm(aes(size = total_placements), 
                  priority = 'density',
                  cex = 2,
                  alpha = .5) +
    geom_violin(color = 'red', draw_quantiles = .5, 
                scale = 'count',
                fill = 'transparent') +
    # geom_jitter(aes(size = total_placements)) +
    scale_x_discrete(expand = expand_scale(add=c(0, 1))) +
    scale_y_continuous(labels = scales::percent_format()) +
    ylab('permanent placement rate') +
    scale_size(name = 'total placements') +
    theme_minimal()
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## Warning: Removed 1020 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1020 rows containing missing values (position_beeswarm).

ggsave(str_c(plots_path, 'prestige_placement.png'), 
       width = 8, height = 4)
## Warning: Removed 1020 rows containing non-finite values (stat_ydensity).

## Warning: Removed 1020 rows containing missing values (position_beeswarm).
ggsave(str_c(paper_folder, 'fig_placement.png'), 
       width = 8, height = 4)
## Warning: Removed 1020 rows containing non-finite values (stat_ydensity).

## Warning: Removed 1020 rows containing missing values (position_beeswarm).
plotly::ggplotly()
## Warning: Removed 1020 rows containing non-finite values (stat_ydensity).

## Warning: Removed 1020 rows containing missing values (position_beeswarm).
univ_df %>%
    group_by(prestige) %>%
    summarize_at(vars(perm_placement_rate), 
                 funs(median, max, min), na.rm = TRUE)
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## # A tibble: 2 × 4
##   prestige      median   max   min
##   <chr>          <dbl> <dbl> <dbl>
## 1 high-prestige  0.579     1 0.188
## 2 low-prestige   0.385     1 0
## prestige stability ----

Stability of prestige status

By randomly rewiring the network, we can test the stability of the prestige categories to data errors and the short time frame of our data. In each of 500 permutations, we randomly rewire 10% of the edges in the permanent hiring network, then calculate out-centralities on the rewired network. Using a threshold of 10^-12 for high-prestige status, we count the fraction of rewired networks in which each program is high-prestige.

## ~6 sec
tic()
set.seed(13579)
permutations = 1:500 %>%
    ## Rewire 10% of edges
    map(~ {hiring_network %>%
            rewire(keeping_degseq(loops = TRUE, 
                                  niter = floor(.05 * length(E(.)))))}) %>%
    ## Calculate out-centralities
    map(~ {.x %>%
            to_reverse() %>%
            eigen_centrality(directed = TRUE, weights = NA) %>%
            .$vector}) %>%
    transpose() %>%
    map(unlist) %>%
    ## Fraction where program is high-prestige
    map(~ sum(. > 10^-12) / length(.)) %>%
    map(~ tibble(frac_high_prestige = .)) %>%
    bind_rows(.id = 'univ_id')
toc()
## 4.919 sec elapsed
## frac_high_prestige indicates the fraction of permutations in which the program was high-prestige
ggplot(permutations, aes(frac_high_prestige)) + 
    geom_density() + 
    geom_rug()

univ_df = left_join(univ_df, permutations)
## Joining, by = "univ_id"

Finding: Counterfactual high-prestige status is strongly correlated with centrality ranking.

ggplot(univ_df, aes(log10(out_centrality), frac_high_prestige)) + 
    geom_point(aes(text = univ_name)) +
    geom_smooth(method = 'lm')
## Warning: Ignoring unknown aesthetics: text
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 304 rows containing non-finite values (stat_smooth).
## Warning: Removed 304 rows containing missing values (geom_point).

plotly::ggplotly()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 304 rows containing non-finite values (stat_smooth).
ggplot(univ_df, aes(perm_placements, frac_high_prestige, color = prestige)) + 
    geom_point(aes(text = univ_name)) +
    geom_smooth(method = 'lm')
## Warning: Ignoring unknown aesthetics: text
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1039 rows containing non-finite values (stat_smooth).
## Warning: Removed 1039 rows containing missing values (geom_point).

plotly::ggplotly()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1039 rows containing non-finite values (stat_smooth).

Finding: For low-prestige programs, counterfactual prestige seems to depend on the extent of the program’s downstream hiring network. Compare Boston College (19 permanent placements; 40% high prestige) to Leuven (18 permanent placements; 20% high prestige).

focal_nodes = hiring_network %>% 
    as_tibble() %>% 
    filter(univ_name %in% c('Villanova University', 'Katholieke Universiteit Leuven')) %>% 
    pull(name)

focal_nodes_net = make_ego_graph(hiring_network, order = 10, 
                                      nodes = focal_nodes,
                                      mode = 'out') %>% 
    map(as_tbl_graph) %>%
    reduce(bind_graphs) %>% 
    mutate(perm_placements = ifelse(is.na(perm_placements), 0, perm_placements))

focal_nodes_layout = create_layout(focal_nodes_net, 'stress')

ggraph(focal_nodes_layout) + 
    geom_node_label(aes(label = univ_name, 
                        size = perm_placements,
                        fill = perm_placements), 
                    color = 'white') + 
    geom_edge_parallel(arrow = arrow(angle = 45,
                                     length = unit(.1, 'inches'),
                                     type = 'closed'),
                       alpha = .3) +
    scale_size_continuous(range = c(1, 5),
                          # na.value = 1,
                          name = 'placements',
                          guide = 'none') +
    scale_fill_viridis(na.value = 'black', name = 'permanent\nplacements') +
    theme_graph()

ggsave(str_c(plots_path, 'focal_nodes.png'), 
       height = 6, width = 10, dpi = 600, scale = 1.25)
ggsave(str_c(paper_folder, 'fig_focal_nodes.png'), 
       height = 6, width = 10, dpi = 600, scale = 1.25)

Plotting

## plotting ----

Coarser community structure

# large_comms = which(sizes(communities) > 20)
# V(hiring_network_gc)$community_coarse = ifelse(
#     V(hiring_network_gc)$community %in% large_comms, 
#     V(hiring_network_gc)$community, 
#     NA)

## Big giant hairy ball
## ~13 sec
# tic()
# layout_net = hiring_network %>%
#     layout_with_stress() %>%
#     `colnames<-`(c('x', 'y')) %>%
#     as_tibble()
# toc()

## Focus on Oxford
## GC only; ~7 sec
# tic()
# layout_net = create_layout(hiring_network_gc, 'focus', focus = 512)
# toc()

## Stress majorization
## ~2 sec
tic()
layout_net = create_layout(hiring_network_gc, 'stress')
toc()
## 0.565 sec elapsed
layout_net %>%
    ## NB neither of these preserve the layout_tbl_graph class, which breaks things when we get to ggraph()
    # filter(total_placements > 0) %>% 
    # induced_subgraph(which(degree(., mode = 'out') > 0)) %>% 
    ggraph() +
    geom_edge_parallel(arrow = arrow(length = unit(.02, 'npc'), 
                                     angle = 15,
                                     type = 'closed'),
                       # spread = 5, 
                       alpha = .05) +
    geom_node_point(aes(#size = log10(out_centrality),
        alpha = log10(out_centrality),
        # color = as.factor(community)
        color = cluster_label
        # color = log10(out_centrality)
    ), size = 2) +
    # scale_color_brewer(palette = 'Set1', na.value = 'black') +
    scale_color_viridis_d(na.value = 'black', name = 'Semantic cluster') +
    # scale_size_discrete(range = c(2, 6)) +
    scale_alpha_continuous(range = c(.2, 1), 
                           name = 'Out centrality (log10)') +
    # scale_size_continuous(range = c(1, 3)) +
    theme_graph()

ggsave(str_c(plots_path, 'hairball.png'), 
       width = 12, height = 12)
ggsave(str_c(paper_folder, 'fig_hairball.png'), 
       width = 12, height = 12)

# hiring_network_gc %>%
#     induced_subgraph(!is.na(V(.)$cluster_lvl1)) %>%
#     ggraph() +
#     geom_node_label(aes(size = prestigious, 
#                         label = univ_name,
#                         # color = as.factor(community))) +
#                         fill = cluster_lvl4), color = 'black') +
#     geom_edge_fan(aes(linetype = aos_category, color = aos_category), 
#                   width = 1,
#                   arrow = arrow(length = unit(.01, 'npc')), 
#                   spread = 5) +
#     scale_edge_color_brewer(palette = 'Set1') +
#     scale_fill_brewer(palette = 'Set3', na.value = 'grey75') +
#     scale_size_discrete(range = c(3, 5)) +
#     theme_graph()


## Chord diagram
# hiring_network_gc %>%
#     # induced_subgraph(which(degree(., mode = 'out') > 0)) %>%
#     ggraph(layout = 'linear', sort.by = 'community_coarse', circular = TRUE) +
#     geom_edge_arc(arrow = arrow(length = unit(.01, 'npc')), alpha = .1) +
#     geom_node_point(aes(size = prestigious, 
#                         # color = as.factor(community))) +
#                         color = cluster_lvl4)) +
#     scale_color_brewer(palette = 'Set1', guide = 'none') +
#     theme_graph()

## Separate networks for each cluster
# cluster_ids = hiring_network %>% 
#     as_tibble() %>% 
#     split(., .$cluster_lvl4) %>% 
#     map(pull, name)
# 
# cluster_ids %>% 
#     map(~induced_subgraph(hiring_network, .))

# hiring_network %>% 
#     # filter(!is.na(cluster_lvl4)) %>% 
#     ggraph(layout = 'manual', 
#        node.positions = layout_net) +
#     geom_edge_fan(alpha = .1) +
#     geom_node_point(aes(color = cluster_label), show.legend = TRUE) +
#     # facet_nodes(vars(cluster_lvl4)) +
#     scale_color_viridis_d(na.value = 'grey90') +
#     theme_graph()

## output ----

Save university-level data with network statistics

write_rds(univ_df, str_c(data_folder, '03_univ_net_stats.rds'))


sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] tictoc_1.0.1     assertthat_0.2.1 ggbeeswarm_0.6.0 ggforce_0.3.3   
##  [5] broom_0.7.9      ggraph_2.0.5     tidygraph_1.2.0  igraph_1.2.6    
##  [9] forcats_0.5.1    stringr_1.4.0    dplyr_1.0.7      purrr_0.3.4     
## [13] readr_2.0.1      tidyr_1.1.3      tibble_3.1.4     ggplot2_3.3.5   
## [17] tidyverse_1.3.1 
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-152       fs_1.5.0           lubridate_1.7.10   RColorBrewer_1.1-2
##  [5] httr_1.4.2         tools_4.1.0        backports_1.2.1    bslib_0.2.5.1     
##  [9] utf8_1.2.2         R6_2.5.1           vipor_0.4.5        lazyeval_0.2.2    
## [13] DBI_1.1.1          mgcv_1.8-36        colorspace_2.0-2   withr_2.4.2       
## [17] tidyselect_1.1.1   gridExtra_2.3      compiler_4.1.0     cli_3.0.1         
## [21] rvest_1.0.1        xml2_1.3.2         plotly_4.9.4.1     labeling_0.4.2    
## [25] sass_0.4.0         scales_1.1.1       digest_0.6.27      rmarkdown_2.9     
## [29] pkgconfig_2.0.3    htmltools_0.5.1.1  dbplyr_2.1.1       highr_0.9         
## [33] htmlwidgets_1.5.3  rlang_0.4.11       readxl_1.3.1       rstudioapi_0.13   
## [37] jquerylib_0.1.4    farver_2.1.0       generics_0.1.0     jsonlite_1.7.2    
## [41] crosstalk_1.1.1    magrittr_2.0.1     Matrix_1.3-4       Rcpp_1.0.7        
## [45] munsell_0.5.0      fansi_0.5.0        viridis_0.6.1      lifecycle_1.0.0   
## [49] stringi_1.7.4      yaml_2.2.1         MASS_7.3-54        grid_4.1.0        
## [53] ggrepel_0.9.1      crayon_1.4.1       lattice_0.20-44    graphlayouts_0.7.1
## [57] haven_2.4.1        splines_4.1.0      hms_1.1.0          knitr_1.33        
## [61] pillar_1.6.2       codetools_0.2-18   reprex_2.0.0       glue_1.4.2        
## [65] evaluate_0.14      data.table_1.14.0  modelr_0.1.8       vctrs_0.3.8       
## [69] tzdb_0.1.2         tweenr_1.0.2       cellranger_1.1.0   gtable_0.3.0      
## [73] polyclip_1.10-0    xfun_0.24          viridisLite_0.4.0  beeswarm_0.4.0    
## [77] ellipsis_0.3.2